11. Solution: Build Pipeline
import nltk
nltk.download(['punkt', 'wordnet'])
import re
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
def load_data():
df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
X = df.text.values
y = df.category.values
return X, y
def tokenize(text):
detected_urls = re.findall(url_regex, text)
for url in detected_urls:
text = text.replace(url, "urlplaceholder")
tokens = word_tokenize(text)
lemmatizer = WordNetLemmatizer()
clean_tokens = []
for tok in tokens:
clean_tok = lemmatizer.lemmatize(tok).lower().strip()
clean_tokens.append(clean_tok)
return clean_tokens
def display_results(y_test, y_pred):
labels = np.unique(y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
accuracy = (y_pred == y_test).mean()
print("Labels:", labels)
print("Confusion Matrix:\n", confusion_mat)
print("Accuracy:", accuracy)
def main():
X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipeline = Pipeline([
('vect', CountVectorizer(tokenizer=tokenize)),
('tfidf', TfidfTransformer()),
('clf', RandomForestClassifier())
])
# train classifier
pipeline.fit(X_train, y_train)
# predict on test data
y_pred = pipeline.predict(X_test)
# display results
display_results(y_test, y_pred)
main()
Output:
Actual numbers will vary.
Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
[[ 87 0 26]
[ 3 21 8]
[ 4 0 452]]
Accuracy: 0.931780366057